#########################################################################
# FigureE3.R
#########################################################################

library(tidyverse)

#########################################################################
# 1. Process Data
#########################################################################

# Read the survey export and drop its first two data rows (presumably the
# extra Qualtrics header/metadata rows -- confirm against the raw file).
# Each remaining row then receives a sequential integer id.
dt <- read.csv("qualtrics_Alaska_nonnumeric.csv")
dt <- dt[-(1:2), ]
dt$id <- seq_len(nrow(dt))

#########################################################################
# 2. Forced Ranking
#########################################################################

# Reference choice set: the i-th entry is the candidate whose rank is stored
# in response column C6_i (forced ranking) / C5_i (optional ranking).
# "Thibaka" is only the internal key; it is relabelled "Tshibaka" before
# plotting.
ref_set <- c("Chesbro", "Kelley", "Murkowski", "Thibaka")

# Decompose ballot position x candidate.
# C6_DO is split on "|" into C6_pos1..C6_pos4 (presumably the order in which
# the candidates were displayed -- confirm), then the four response columns
# C6_1..C6_4 are stacked so that each row is one respondent x candidate with
# a numeric `rank`.
dtc <- dt %>%
  separate(C6_DO,
           into = paste0("C6_pos", 1:4),
           sep = "[|]",        # character class so "|" is matched literally
           remove = FALSE) %>%
  dplyr::select(id, starts_with("C6"), -C6_DO) %>%
  pivot_longer(cols = C6_1:C6_4,
               values_to = "rank") %>%
  dplyr::select(id, name, rank, everything()) %>%
  mutate(
    # Map the column label C6_i to the i-th candidate; anything else -> NA
    name = ref_set[match(name, paste0("C6_", 1:4))],
    rank = as.numeric(rank)
  )


# Rows per candidate in the long table (every respondent x candidate row is
# counted, whether or not a rank was recorded).
total_counts <- dtc %>% count(name, name = "total_count")

# Keep only first-choice rows (rank 1); rows with a missing rank are dropped.
filtered_data <- dtc %>% filter(rank == 1)

# First-choice rows per candidate
name_counts <- filtered_data %>% count(name, name = "count")

# First-choice share (%) = first-choice rows / all rows for that candidate.
# Candidates with no first-choice row do not appear in the result.
# NOTE(review): the confidence interval computed further down is centred on
# the share among first-choice rows only, which equals this percentage only
# when every respondent has a rank-1 row -- confirm the denominators agree.
merged_data2 <- name_counts %>%
  left_join(total_counts, by = "name") %>%
  mutate(percentage = count / total_count * 100)

# Show the table
print(merged_data2)

################################################################################
### Estimate the voter share and compute SE

# First-choice counts per candidate and each candidate's share of all
# first-choice rows. Both are 1-d tables named by candidate.
name_counts <- table(filtered_data$name)
name_proportions <- prop.table(name_counts)

# Standard error of a proportion: sqrt(p * (1 - p) / n), with n the total
# number of first-choice rows.
name_se <- sqrt(name_proportions * (1 - name_proportions) / sum(name_counts))

# Critical value for a two-sided 90% normal-approximation interval
# (5% in each tail). NOTE(review): the error bars in the figure are therefore
# 90% intervals; use qnorm(0.975) here if 95% intervals are intended.
z_crit_90 <- qnorm(0.95)

ci_lower <- name_proportions - z_crit_90 * name_se
ci_upper <- name_proportions + z_crit_90 * name_se

# Assemble the results. Because the inputs are tables, data.frame() expands
# each one into a `<arg>.Var1` (candidate) and `<arg>.Freq` (value) column,
# e.g. CI_Lower.Var1 / CI_Lower.Freq. Later code relies on the `.Freq` names.
result_table_full_sen <- data.frame(
  Name = names(name_proportions),
  Proportion = name_proportions,
  Standard_Error = name_se,
  CI_Lower = ci_lower,
  CI_Upper = ci_upper
)


# Put the interval bounds on the percent scale used for plotting
# (Proportion.Freq and Standard_Error.Freq stay on the 0-1 scale).
result_table_full_sen$CI_Lower.Freq <- result_table_full_sen$CI_Lower.Freq * 100
result_table_full_sen$CI_Upper.Freq <- result_table_full_sen$CI_Upper.Freq * 100

# Keys used to merge these intervals onto the plotting data
result_table_full_sen$dataset <- "Forced-Ranking Question"
colnames(result_table_full_sen)[1] <- "name"

#########################################################################
# 3. Optional Ranking
#########################################################################

# Decompose ballot position x candidate.
# C5_DO is split on "|" into C5_pos1..C5_pos4 (presumably the order in which
# the candidates were displayed -- confirm), then the four response columns
# C5_1..C5_4 are stacked so that each row is one respondent x candidate.
# `selection` is 1 when a numeric rank is present and 0 when it is missing.
dtc <- dt %>%
  separate(C5_DO,
           into = paste0("C5_pos", 1:4),
           sep = "[|]",        # character class so "|" is matched literally
           remove = FALSE) %>%
  dplyr::select(id, starts_with("C5"), -C5_DO) %>%
  pivot_longer(cols = C5_1:C5_4,
               values_to = "rank") %>%
  mutate(
    # Map the column label C5_i to the i-th candidate; anything else -> NA
    name = ref_set[match(name, paste0("C5_", 1:4))],
    rank = as.numeric(rank),
    selection = ifelse(is.na(rank), 0, 1)
  ) %>%
  dplyr::select(id, name, selection, rank, everything())



# Rows per candidate in the long table (every respondent x candidate row is
# counted, including candidates the respondent left unranked).
total_counts <- dtc %>% count(name, name = "total_count")

# Keep only first-choice rows (rank 1); unranked rows are dropped.
filtered_data <- dtc %>% filter(rank == 1)

# First-choice rows per candidate
name_counts <- filtered_data %>% count(name, name = "count")

# First-choice share (%) = first-choice rows / all rows for that candidate.
# Candidates with no first-choice row do not appear in the result.
# NOTE(review): the confidence interval computed further down is centred on
# the share among first-choice rows only, which equals this percentage only
# when every respondent has a rank-1 row -- confirm the denominators agree.
merged_data3 <- name_counts %>%
  left_join(total_counts, by = "name") %>%
  mutate(percentage = count / total_count * 100)

# Show the table
print(merged_data3)

################################################################################
### Estimate the voter share and compute SE

# First-choice counts per candidate and each candidate's share of all
# first-choice rows. Both are 1-d tables named by candidate.
name_counts <- table(filtered_data$name)
name_proportions <- prop.table(name_counts)

# Standard error of a proportion: sqrt(p * (1 - p) / n), with n the total
# number of first-choice rows.
name_se <- sqrt(name_proportions * (1 - name_proportions) / sum(name_counts))

# Critical value for a two-sided 90% normal-approximation interval
# (5% in each tail). NOTE(review): the error bars in the figure are therefore
# 90% intervals; use qnorm(0.975) here if 95% intervals are intended.
z_crit_90 <- qnorm(0.95)

ci_lower <- name_proportions - z_crit_90 * name_se
ci_upper <- name_proportions + z_crit_90 * name_se

# Assemble the results. Because the inputs are tables, data.frame() expands
# each one into a `<arg>.Var1` (candidate) and `<arg>.Freq` (value) column,
# e.g. CI_Lower.Var1 / CI_Lower.Freq. Later code relies on the `.Freq` names.
result_table_par_sen <- data.frame(
  Name = names(name_proportions),
  Proportion = name_proportions,
  Standard_Error = name_se,
  CI_Lower = ci_lower,
  CI_Upper = ci_upper
)


# Put the interval bounds on the percent scale used for plotting
# (Proportion.Freq and Standard_Error.Freq stay on the 0-1 scale).
result_table_par_sen$CI_Lower.Freq <- result_table_par_sen$CI_Lower.Freq * 100
result_table_par_sen$CI_Upper.Freq <- result_table_par_sen$CI_Upper.Freq * 100

# Keys used to merge these intervals onto the plotting data
result_table_par_sen$dataset <- "Optional-Ranking Question"
colnames(result_table_par_sen)[1] <- "name"


#########################################################################
# 4. Official Records
#########################################################################

# Stack the survey-based first-choice percentages from the two question
# formats, tagging each row with the question it came from.
# `name` is deliberately left as a plain character column here: it is a merge
# key and is relabelled below; the x-axis ordering is applied just before
# plotting.
senate_combined_data <- bind_rows(
  mutate(merged_data2, dataset = "Forced-Ranking Question"),
  mutate(merged_data3, dataset = "Optional-Ranking Question")
)

# Official election vote shares (%), keyed by the internal candidate labels
vote_share_mapping <- c("Murkowski" = 43.4, "Thibaka" = 42.6,
                        "Chesbro" = 10.7, "Kelley" = 3.3)

# Same layout as the survey rows so the two can be stacked
additional_mapping <- data.frame(
  name = names(vote_share_mapping),
  percentage = unname(vote_share_mapping),
  dataset = "Actual Election Data"
)

# Append the official results to the survey-based rows
senate_combined_data <- bind_rows(senate_combined_data, additional_mapping)

# Confidence-interval tables for both question formats
combined_table_sen <- rbind(result_table_par_sen, result_table_full_sen)

# Attach the interval bounds by candidate and dataset. all.x = TRUE keeps the
# official-result rows, which have no interval and therefore get NA bounds.
senate_combined_data <- merge(
  senate_combined_data,
  combined_table_sen[, c("name", "dataset", "CI_Lower.Freq", "CI_Upper.Freq")],
  by = c("name", "dataset"),
  all.x = TRUE
)

# Replace the internal key 'Thibaka' with the display name 'Tshibaka'
senate_combined_data <- senate_combined_data %>%
  mutate(name = ifelse(name == "Thibaka", "Tshibaka", name))


#########################################################################
# 5. Generate Figure E3
#########################################################################

# Give every row of a candidate that candidate's official vote share
# (new_var), so the x-axis can be ordered by it. The value is taken from the
# "Actual Election Data" row and filled in both directions within each
# candidate, so the result does not depend on where that row sits in the group.
senate_combined_data <- senate_combined_data %>%
  mutate(new_var = ifelse(dataset == "Actual Election Data", percentage, NA)) %>%
  group_by(name) %>%
  fill(new_var, .direction = "downup") %>%
  ungroup()

# Order candidates on the x-axis by increasing official vote share
senate_combined_data$name <- fct_reorder(senate_combined_data$name,
                                         senate_combined_data$new_var)


# Figure E3: one point per candidate x dataset, dodged side by side, with a
# vertical interval line wherever interval bounds are available. Colour,
# fill and shape all encode the dataset.
ggplot(
  data = senate_combined_data,
  mapping = aes(
    x = name,
    y = percentage,
    fill = dataset,
    shape = dataset,
    color = dataset
  )
) +
  # Point estimates
  geom_point(size = 3, position = position_dodge(width = 0.5)) +
  # Interval lines without end caps (width = 0)
  geom_errorbar(
    mapping = aes(ymin = CI_Lower.Freq, ymax = CI_Upper.Freq),
    width = 0,
    position = position_dodge(width = 0.5)
  ) +
  labs(
    title = "US Senate Election Results - Alaska",
    x = "Candidate Name",
    y = "Vote Share (%)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 12)
  ) +
  # One shape / colour per dataset, assigned in the order of the dataset levels
  scale_shape_manual(values = c(17, 16, 15)) +
  scale_color_manual(values = c("darkblue", "darkred", "darkgreen")) +
  scale_fill_manual(values = c("darkblue", "darkred", "darkgreen"))

# Write the most recently created plot to disk as a 10 x 6 PDF
ggsave(filename = "FigureE3.pdf", width = 10, height = 6)


#############################################################################
# END OF THIS R SOURCE FILE
#############################################################################
